Advanced preprocessing: multiplets detection and metacells generation¶
Data description¶
For info on the data:
- Single-cell transcriptomics dataset from the paper published by Trevino et al. (Cell 2021), characterizing the human fetal cortex at mid-gestation
- Paper
- Dataset interactive Viewer
- Samples: human fetal brain cortex at mid-gestation (4 subjects from PCW16 to PCW24)
- Sequencing method: single-cell RNA sequencing (Chromium platform - 10x Genomics)
- Obtained number of cells: ~58,000
Notebook content¶
- Additional QC for robust analysis
- Multiplets detection
- Generation of metacells from kNN graph
Library loading¶
import os
import sys
import random
import itertools
import warnings
import yaml

import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy import stats
import statsmodels.api as sm
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio

import scanpy as sc
import anndata as ad

warnings.filterwarnings('ignore')
pio.renderers.default = "jupyterlab"
random.seed(1)
homeDir = os.getenv("HOME")

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()

# import user defined functions from the utils folder
sys.path.insert(1, "./utils/")
from CleanAdata import *
from SankeyOBS import *
scanpy==1.10.1 anndata==0.10.7 umap==0.5.6 numpy==1.26.4 scipy==1.11.4 pandas==2.2.2 scikit-learn==1.4.2 statsmodels==0.14.2 igraph==0.11.5 pynndescent==0.5.12
1. Data Load¶
data_path = '/group/brainomics/InputData/'
adata = sc.read_h5ad(data_path + 'Day1_2_TrevinoFiltNormAdata.h5ad')
# fix formatting for some metadata
adata.obs["Auth_Assay_formatted"] = adata.obs.Auth_Assay.str.replace(" ","_")
2. Additional QCs¶
First, we compute the quality metrics again:
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
sc.pp.calculate_qc_metrics(
adata, qc_vars=["mt", "ribo"], inplace=True, log1p=True
)
sc.pl.embedding(adata, color=["cell_class", "cell_label","Leiden_Sel","Auth_Batch"], ncols=3, wspace=.5, size = 35, vmin='p1', vmax='p99', basis = "X_umap_nocorr")
What do you observe?
Hint
We can already observe that some of the original cell labels appear to be batch-specific (e.g., RG_early/late and some ExN clusters). Additionally, one cluster annotated as InN (Leiden 11) lies between the excitatory neurons and the CGE/MGE interneurons. Since distances in the UMAP are not meaningful, we can plot the PCA to see whether we make similar observations:
sc.pl.pca(adata, color=["cell_label","Leiden_Sel"], ncols=4, wspace=.5, size = 50, vmin='p1', vmax='p99')
sc.tl.rank_genes_groups(adata, groupby="Leiden_Sel", method="wilcoxon", groups=["11"], reference="6")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
sc.tl.rank_genes_groups(adata, groupby="Leiden_Sel", method="wilcoxon", groups=["11"], reference="8")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
ranking genes
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:08)
ranking genes
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:06)
2.2. Multiplets inspection via scDblFinder¶
- Generation of in silico doublets
- kNN classification of initial cells based on the in silico doublets
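The sketch below is a toy illustration of this idea, not scDblFinder itself: it simulates doublets by summing the counts of random cell pairs and then scores each real cell by the fraction of simulated doublets among its nearest neighbours (all data and parameters here are made up for illustration).
import numpy as np
from sklearn.neighbors import NearestNeighbors
# toy counts matrix: 500 "cells" x 200 "genes"
counts = np.random.poisson(0.5, size=(500, 200)).astype(float)
# simulate doublets by summing the counts of random cell pairs
pairs = np.random.randint(0, counts.shape[0], size=(500, 2))
sim_doublets = counts[pairs[:, 0]] + counts[pairs[:, 1]]
# kNN over real + simulated cells; score = fraction of simulated neighbours
combined = np.vstack([counts, sim_doublets])
is_sim = np.r_[np.zeros(counts.shape[0]), np.ones(sim_doublets.shape[0])]
nn = NearestNeighbors(n_neighbors=20).fit(combined)
_, idx = nn.kneighbors(combined[:counts.shape[0]])
doublet_score = is_sim[idx].mean(axis=1)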
import logging
import anndata2ri
import rpy2.rinterface_lib.callbacks
import rpy2.robjects.packages
from rpy2 import robjects

anndata2ri.activate()
%load_ext rpy2.ipython
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Set R_HOME and R_LIBS_USER so that rpy2 uses the intended R installation
os.environ['R_HOME'] = '/opt/R/4.3.1/lib/R'
os.environ['R_LIBS_USER'] = '/opt/R/4.3.1/lib/R/library'
custom_lib_paths = "/opt/R/4.3.1/lib/R/library"
robjects.r(f'.libPaths(c("{custom_lib_paths}", .libPaths()))')
# Clean anndata from unnecessary fields
sce = adata.copy()
sce.obs = sce.obs[["Auth_Sample.ID"]]
del sce.layers["lognormcounts"]
del sce.obsp
sce.layers["counts"] = sce.layers["counts"].astype(np.int32).todense()
sce.X = sce.layers["counts"].copy()
del sce.obsm
del sce.varm
del sce.uns
sce.var.index.name = None
sce.var = sce.var[["ensg"]]
# Run doublets detection
sce = anndata2ri.py2rpy(sce)
print(sce)
scDblFinder = rpy2.robjects.packages.importr('scDblFinder')
S4Vectors = rpy2.robjects.packages.importr('S4Vectors')
as_data_frame = rpy2.robjects.r['as.data.frame']
sce = scDblFinder.scDblFinder(sce, samples="Auth_Sample.ID")
# Save doublets info column
#sce.obs["scDblFinder.class"].to_csv("...", sep="\t")
# Import doublets class information
DBLs = pd.read_csv("./utils/DoubletsClass.tsv", delimiter="\t", index_col=0)
adata.obs = pd.concat([adata.obs, DBLs], axis = 1).loc[adata.obs_names]
sc.pl.embedding(adata, color=["scDblFinder.class","Leiden_Sel"], ncols=3, wspace=.3, size = 35, vmin='p1', vmax='p99', basis = "X_umap_nocorr")
plotSankey(adata, covs=["Leiden_Sel","scDblFinder.class"])
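Before filtering, it can be useful to quantify the doublet calls per sample; a minimal check (assuming the scDblFinder.class column imported above) could be:
# fraction of singlets/doublets in each sample
print(pd.crosstab(adata.obs["Auth_Sample.ID"], adata.obs["scDblFinder.class"], normalize="index"))
We then drop cluster 11 (the ambiguous cluster highlighted above) and keep only the cells classified as singlets: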
adata = adata[~adata.obs["Leiden_Sel"].isin(["11"])]
adata = adata[adata.obs["scDblFinder.class"].isin(["singlet"])]
2.3 Additional filtering¶
# refine filtering
sc.pl.scatter(adata, x="pct_counts_mt",y="n_genes_by_counts", size=40, color="pct_counts_ribo")
adata = adata[adata.obs["n_genes_by_counts"] >= 2000]
2.4 Batches inspection¶
An important quality check is to inspect the impact of technical variables on the distribution of the cells in low-dimensional space and to assess whether they could act as confounders. Ideally, when preparing an experimental design, you want a similar representation of your biological samples across batches.
For this dataset, the metadata key Auth_Batch defines the batches but also splits the samples by post-conceptional week, so it will be hard to distinguish the batch effect from the biological effect of sampling different time points. This needs to be taken into account when interpreting the results.
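A quick tabular check of this balance (complementing the per-batch embeddings below) is a simple cross-tabulation of samples against batches, using the metadata columns already present in adata.obs:
# number of cells per sample in each batch; strongly unbalanced rows hint at confounding
print(pd.crosstab(adata.obs["Auth_Sample.ID"], adata.obs["Auth_Batch"]))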
fig, axes = plt.subplots(1, len(adata.obs.Auth_Batch.unique()), figsize=(20, 4), dpi=200)
for i, batch in enumerate(adata.obs.Auth_Batch.unique()):
    SampleIDs = adata.obs.loc[adata.obs.Auth_Batch == batch, "Auth_Sample.ID"].unique().tolist()
    axes[i] = sc.pl.embedding(adata, size=40, add_outline=True, ncols=2, color=["Auth_Sample.ID"],
                              title="{} replicates".format(batch), groups=SampleIDs,
                              vmin='p1', vmax='p99', show=False, ax=axes[i], basis="umap_nocorr")
plt.subplots_adjust(wspace=.5)
fig.show()
sc.pl.pca(adata, color=["Auth_Sample.ID","Auth_Batch","Auth_Age","cell_label","Auth_Assay_formatted"], ncols=3, wspace=.4, size = 50, vmin='p1', vmax='p99')
plotSankey(adata, covs=["Auth_Age","Auth_Batch","Auth_Assay_formatted"])
2.4.1 PCA regressors to check variance associated with covariates¶
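The function below essentially regresses each principal component's coordinates on dummy variables for the levels of each covariate and reports the adjusted R² as a rough measure of how much of the variance along that PC the covariate explains. As a minimal sketch of the underlying idea for a single PC and a single covariate (assuming the X_pca embedding and the Auth_Batch column computed above):
from statsmodels.formula.api import ols
# adjusted R^2 of an OLS fit of the PC1 coordinates on one categorical covariate
df = pd.DataFrame({"value": adata.obsm["X_pca"][:, 0], "cov": adata.obs["Auth_Batch"].values})
fit = ols("value ~ C(cov)", data=df).fit()
print(round(fit.rsquared_adj, 2))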
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
npcs = 6
covToTest = ["Auth_Sample.ID","Auth_Batch","Auth_Assay_formatted","Auth_Age","cell_label"]
def plotResiduals(adata, covToTest, npcs):
PCRegrDict = {}
sns.set_theme(style="ticks")
varianceSummary = pd.DataFrame()
for i in range(npcs):
for n,c in enumerate(covToTest):
# format the data
Dummies = pd.get_dummies(adata.obs[c])
PCRegr = (Dummies.T * adata.obsm["X_pca"][:,i].T).T.melt()
PCRegr = PCRegr[PCRegr["value"] != 0]
PCRegr["variable"] = PCRegr["variable"].astype("category")
PCRegr["cov"] = c
PCRegrDict["PC:{}_COV:{}".format(i+1,c)] = PCRegr.copy()
sns.despine(offset=30)
PCRegrModel = pd.get_dummies(PCRegrDict["PC:{}_COV:{}".format(i+1,c)], "variable").copy()
# define the regression formula
formula = "".join(["value ~ "," + ".join(["C("+c+")" for c in PCRegrModel.columns[1:-1].tolist()])])
# fit the regression
fit = ols(formula, data=PCRegrModel).fit()
# fit.rsquared_adj
# get the residuals
PCRegr["rsquared_adj"] = fit.rsquared_adj
PCRegr["PC"] = i
varianceSummary = pd.concat([varianceSummary,PCRegr ], ignore_index=True)
CovOrder = {i:varianceSummary.drop_duplicates(["PC","cov"])[varianceSummary.drop_duplicates(["PC","cov"])["PC"] == i].sort_values("rsquared_adj", ascending=False)["cov"].tolist() for i in varianceSummary["PC"].unique().tolist()}
for i in range(npcs):
fig, axes = plt.subplots(1,len(covToTest), figsize=(40,4), dpi=300, sharex=False,sharey=False)
plt.subplots_adjust(wspace=.5)
#adata.obsm["X_pca"]
for n,c in enumerate(CovOrder[i]):
PCRegr = varianceSummary[(varianceSummary["PC"] == i) & (varianceSummary["cov"] == c)]
sns.boxplot(data=PCRegr, x="variable", y="value", ax = axes[n],whis=[0, 100],
palette={i:adata.uns[c+"_colors"][adata.obs[c].cat.categories.tolist().index(i)] for i in PCRegr["variable"].unique().tolist()}
#order=[i for i in PlotOrder[c] if i in PCRegr["variable"].unique().tolist()],
#hue_order=[i for i in PlotOrder[c] if i in PCRegr["variable"].unique().tolist()])
)
sns.stripplot(data=PCRegr, x="variable", y="value", size=4, color=".3",ax = axes[n],
#order=[i for i in PlotOrder[c] if i in PCRegr["variable"].unique().tolist()],
#hue_order=[i for i in PlotOrder[c] if i in PCRegr["variable"].unique().tolist()])
)
axes[n].title.set_text('Covariate:{}'.format(c))
axes[n].set_xlabel(c, fontsize=20)
axes[n].set_ylabel("PC{} embeddings".format(i+1), fontsize=20)
sns.despine(offset=30)
            rsq = PCRegr["rsquared_adj"].values[0]
            axes[n].text(0.5, 0, "rsquared_adj:{}".format(round(rsq, 2)), horizontalalignment='center', verticalalignment='center', transform=axes[n].transAxes)
axes[n].tick_params(axis="x", rotation=45)
plt.xticks(rotation=45)
fig.autofmt_xdate(rotation=45)
plotResiduals(adata, covToTest, npcs)
3. kNN-based metacells aggregation¶
Aggregating cells into metacells brings several advantages:
- Reduced computational burden
- Reduced data sparsity and improved signal-to-noise ratio
- Mitigation of stochastic transcriptional spikes in individual cells
- No reliance on a previous clustering or annotation
After the aggregation we obtain an object similar to the previous one but with a lower number of cells, each corresponding to a neighbourhood defined by the aggregation method.
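Conceptually, a metacell is just the aggregated expression profile of a small group of similar cells; a toy sketch of the aggregation step (made-up data and labels, mean aggregation as used below):
import numpy as np
import pandas as pd
X = np.random.poisson(1.0, size=(6, 4)).astype(float)    # 6 cells x 4 genes
labels = pd.Series(["m0", "m0", "m1", "m1", "m1", "m2"])  # metacell assignment
metacells = pd.DataFrame(X).groupby(labels.values).mean() # one mean profile per metacell
print(metacells.shape)                                    # (3, 4): 3 metacells x 4 genes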
3.1 Data preparation¶
pd.crosstab(adata.obs.cell_label,adata.obs.cell_class).T
| cell_label | CycProg | Endo | ExN_N1 | ExN_N2 | ExN_N3 | ExN_N4 | ExN_N5 | ExN_N6 | ExN_N7 | ExN_N8 | ... | In_MGE | Microglia | OPC_Oligo | Pericytes | RBC | RG_early | RG_late | SubPlate | VLMC | tRG |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| cell_class | |||||||||||||||||||||
| Pg | 1789 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1527 | 1350 | 0 | 0 | 424 |
| Other | 0 | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 175 | 1 | 0 | 0 | 0 | 4 | 0 |
| ExN | 0 | 0 | 1470 | 2519 | 2 | 3018 | 4741 | 1328 | 854 | 441 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 326 | 0 | 0 |
| IPC | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| InN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 369 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Microglia | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 175 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| OPC_Oligo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 489 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7 rows × 23 columns
# Let's remove non-relevant cell types and collapse some others
adata = adata[~adata.obs["cell_class"].isin(['Microglia','Other','InN'])]
pd.crosstab(adata.obs.cell_label,adata.obs.cell_class).T
| cell_label | CycProg | ExN_N1 | ExN_N2 | ExN_N3 | ExN_N4 | ExN_N5 | ExN_N6 | ExN_N7 | ExN_N8 | GliaPg | IPC | OPC_Oligo | RG_early | RG_late | SubPlate | tRG |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| cell_class | ||||||||||||||||
| Pg | 1789 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1179 | 0 | 0 | 1527 | 1350 | 0 | 424 |
| ExN | 0 | 1470 | 2519 | 2 | 3018 | 4741 | 1328 | 854 | 441 | 0 | 0 | 0 | 0 | 0 | 326 | 0 |
| IPC | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 980 | 0 | 0 | 0 | 0 | 0 |
| OPC_Oligo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 489 | 0 | 0 | 0 | 0 |
3.2 Metacells definition and aggregation¶
We propose a method that follows these steps:
First of all, before the aggregation, we need to define the kNN graph and its parameters. We can tune it by choosing the number of Highly Variable Genes used for Principal Component Analysis and, of the resulting principal components, how many will be used to compute the distances between cells. Since we are computing a kNN graph, we also need to choose the number of neighbours (k) to which each cell will be connected.
For each sample, we compute the kNN graph with Scanpy's function sc.pp.neighbors. The resulting graph (each cell's distances to its nearest neighbours) is used as the feature matrix for a [kMeans clustering](https://scikit-learn.org/1.5/modules/clustering.html#k-means), and we take the average normalized expression value of each gene in each cluster. We save these values in a new AnnData object, whose .X matrix is metacells x genes, and concatenate the results for all the samples. Here we set 400 metacells for each sample and we have 8 samples, so we will end up with 3,200 metacells.
We assign a label to each metacell based on the most common cell label among the cells of the corresponding cluster.
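As a self-contained toy sketch of the core clustering step used below (synthetic data, illustrative parameters only): build a kNN graph, turn it into a per-cell feature matrix, and run kMeans on it.
import numpy as np
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import KMeans
X = np.random.rand(200, 10)                                # 200 "cells" x 10 "PCs"
knn = kneighbors_graph(X, n_neighbors=15, mode="distance") # sparse cells x cells kNN matrix
clusters = KMeans(n_clusters=20, random_state=0).fit_predict(knn.toarray())
print(len(set(clusters)))                                  # ~20 metacells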
What impact do you expect from changing the initial number of highly variable genes?
Hints
- A higher number of highly variable genes will:
  - increase the computational burden
  - increase the risk of including genes that are not useful for distinguishing cells (e.g., genes shared by several cell types or widely expressed)
  - increase the risk of having metacells driven by technical confounders
- A lower number of highly variable genes will:
  - decrease the computational burden
  - increase the risk of excluding genes that are useful for defining particular cell states
  - increase the risk of missing subtle differences between cell states and obtaining less pure metacells (e.g., composed of a mix of cells from different cell states)
What impact do you expect from changing the initial number of principal components?
Hints
- Including a higher number of principal components will:
  - increase the computational burden
  - increase the risk of including PCs whose variance is driven by technical variation
  - increase the risk of metacells being representative of a batch rather than of cell states
- Including a lower number of principal components will:
  - decrease the computational burden
  - increase the risk of excluding biological information
  - increase the risk of not capturing enough complexity and losing rarer cell states
NewAdataList = []
import os
os.environ["OMP_NUM_THREADS"] = "4"
n_neighbors = 30
n_pcs = 8
n_top_genes = 2000
n_clusters = 400
for sample in adata.obs["Auth_Sample.ID"].unique().tolist():
adataLocal = adata[adata.obs["Auth_Sample.ID"] == sample].copy()
ncells = adataLocal.shape[0]
filenameBackup = "./utils/kmeansAggregation/Kmeans{}.{}.{}topgenes.{}neighbs.{}pcs.{}Ks.csv".format(ncells,sample,n_top_genes,n_neighbors,n_pcs,n_clusters )
# Check if the kmeans was precomputed
if os.path.isfile(filenameBackup):
print("Retrieving precomputed kmeans classes \n")
KmeansOBS = pd.read_csv(filenameBackup, sep="\t", index_col=0)
adataLocal.obs['kmeans_clusters'] = KmeansOBS
else:
#If not run k-means metacells aggregation
sc.pp.highly_variable_genes(adataLocal, n_top_genes=n_top_genes, flavor="seurat")
sc.tl.pca(adataLocal)
# Step 1: Compute the KNN graph
sc.pp.neighbors(adataLocal, n_neighbors = n_neighbors , transformer='pynndescent', n_pcs=n_pcs, metric="euclidean") # Adjust n_neighbors as needed
        # Step 2: Extract the kNN distance matrix from AnnData (used as features for k-means)
connectivities = adataLocal.obsp['distances'].toarray()
# Step 3: Apply K-Means clustering with a fixed number of clusters
print("Computing kmeans")
adataLocal.obs['kmeans_clusters'] = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(connectivities)
print("Storing kmeans classes \n")
adataLocal.obs["kmeans_clusters"].to_csv(filenameBackup, sep="\t")
adataLocal.X = adataLocal.layers["counts"].copy()
# Normalize counts
sc.pp.normalize_total(adataLocal, target_sum=2e4)
# Create combined AnnData objects efficiently and add to NewAdataList
# Step 2: Create dummy variables for clusters
dummy_clusters = pd.get_dummies(adataLocal.obs['kmeans_clusters'])
    # Step 3: Dot product for mean aggregation of expression values
    # The dummy-matrix dot product gives per-cluster sums; dividing by the cluster
    # sizes yields the mean expression of each gene in each cluster (metacell)
    X_dense = adataLocal.X.A if hasattr(adataLocal.X, "A") else adataLocal.X
    cluster_sizes = dummy_clusters.to_numpy().sum(axis=0)
    cluster_aggregated_X = dummy_clusters.T.to_numpy(dtype=float) @ X_dense / cluster_sizes[:, None]
cluster_aggregated_median_X = np.zeros((dummy_clusters.shape[1], X_dense.shape[1]))
for cluster_idx in range(dummy_clusters.shape[1]):
# Select cells belonging to the current cluster
cluster_cells = X_dense[dummy_clusters.iloc[:, cluster_idx].values == 1]
# Compute the median across cells within the cluster
cluster_aggregated_median_X[cluster_idx, :] = np.median(cluster_cells, axis=0)
# Convert to AnnData structure
adataAggregated = ad.AnnData(X=cluster_aggregated_X)
adataAggregated.layers["median"] = cluster_aggregated_median_X # we save an additional layer having as aggregated value the median of the gene expression
adataAggregated.var_names = adataLocal.var_names
adataAggregated.obs['kmeans_clusters'] = dummy_clusters.columns
# Step 4: Aggregating labels and metadata
# Get the most common cell label for each cluster
adataAggregated.obs['AggregatedClass'] = (
adataLocal.obs.groupby('kmeans_clusters')['cell_class']
.agg(lambda x: x.value_counts().idxmax())
.reindex(dummy_clusters.columns)
.values
)
adataAggregated.obs['AggregatedLabel'] = (
adataLocal.obs.groupby('kmeans_clusters')['cell_label']
.agg(lambda x: x.value_counts().idxmax())
.reindex(dummy_clusters.columns)
.values
)
adataAggregated.obs_names = list(sample+"_"+adataAggregated.obs["kmeans_clusters"].astype(str))
# Assign metadata fields with identical values across each cluster
for obsMD in [col for col in adataLocal.obs.columns if len(adataLocal.obs[col].unique()) == 1]:
adataAggregated.obs[obsMD] = adataLocal.obs[obsMD].iloc[0]
# Add the aggregated AnnData to NewAdataList
NewAdataList.append(adataAggregated)
Retrieving precomputed kmeans classes
normalizing counts per cell
finished (0:00:00)
Retrieving precomputed kmeans classes
normalizing counts per cell
finished (0:00:00)
Retrieving precomputed kmeans classes
normalizing counts per cell
finished (0:00:00)
Retrieving precomputed kmeans classes
normalizing counts per cell
finished (0:00:00)
Retrieving precomputed kmeans classes
normalizing counts per cell
finished (0:00:00)
Retrieving precomputed kmeans classes
normalizing counts per cell
finished (0:00:00)
Retrieving precomputed kmeans classes
normalizing counts per cell
finished (0:00:00)
Retrieving precomputed kmeans classes
normalizing counts per cell
finished (0:00:00)
Save¶
# Save the new aggregated anndata
from scipy import sparse
CombinedAdata = ad.concat(NewAdataList)
# Convert to sparse matrix
CombinedAdata.X = sparse.csr_matrix(CombinedAdata.X)
CombinedAdata.layers["median"] = sparse.csr_matrix(CombinedAdata.layers["median"])
print(adata.shape)
(22437, 16324)
print(CombinedAdata.shape)
(3200, 16324)
CombinedAdata.write_h5ad("./1_CombinedMetaCells.h5ad")
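As an optional sanity check on the aggregated object (using the AggregatedLabel column created in the loop above), one can look at how the metacells distribute across the aggregated labels:
# number of metacells assigned to each majority cell label
print(CombinedAdata.obs["AggregatedLabel"].value_counts())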